# If the GEOquery R/Biocondcutor package is not installed, use biocLite() to install the package:
if (!require("GEOquery")){
  source("http://bioconductor.org/biocLite.R")
  biocLite("GEOquery")
}
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
if(!require("cBioPortalData"))
  BiocManager::install("cBioPortalData")
library(cBioPortalData)

# Load the GEOquery R/Bioconductor package:
library(GEOquery)

########## Dataset 1: GSE19783
#To access GEO Dataset (GDS), use the function getGEO() which returns a list of ExpressionSets:
gse <- getGEO("GSE19783", GSEMatrix = TRUE)
show(gse)

save(gse, file = "/home/esaha/BONOBO/data/miRNA_mRNA_breastCancer/GSE19783.RData")

mRNA = data.frame(gse$`GSE19783-GPL6480_series_matrix.txt.gz`)
miRNA = data.frame(gse$`GSE19783-GPL8227_series_matrix.txt.gz`)

# phenotypic data
phenotype1 = pData(gse$`GSE19783-GPL6480_series_matrix.txt.gz`)
phenotype2 = pData(gse$`GSE19783-GPL8227_series_matrix.txt.gz`)

id1 = unlist(lapply(strsplit(phenotype1$title, split = "-"), function(x){substr(x[2], 1, 3)}))
id2 = unlist(lapply(strsplit(phenotype2$title, split = "-"), function(x){substr(x[2], 1, 3)}))
id = intersect(id1, id2)
mRNA_id = phenotype1$geo_accession[match(id,id1)]
miRNA_id = phenotype2$geo_accession[match(id,id2)]

# Keep samples with both mRNA and miRNA expression
mRNA_expr = t(mRNA[match(mRNA_id, rownames(mRNA)),])
miRNA_expr = t(miRNA[match(miRNA_id, rownames(miRNA)),])

# Remove NA
rowsums = apply(miRNA_expr, MARGIN = 1, function(x){sum(as.numeric(as.character(x)))})
miRNA_expr = miRNA_expr[-which(rowsums == 0 | is.na(rowsums)),]

# Remove genes with all 0
rowsums = apply(mRNA_expr, MARGIN = 1, function(x){sum(as.numeric(as.character(x)))})
mRNA_expr = mRNA_expr[-which(rowsums == 0 | is.na(rowsums)),]

# Get featureData
f1 = fData(gse$`GSE19783-GPL6480_series_matrix.txt.gz`)
f2 = fData(gse$`GSE19783-GPL8227_series_matrix.txt.gz`)
gene = f1$GENE_SYMBOL[match(rownames(mRNA_expr), f1$ID)]
noncoding = unlist(lapply(strsplit(f2$miRNA_ID[f2$miRNA_ID == f2$ID], split = "-"), function(x){paste(x, collapse = ".")}))
noncoding = intersect(rownames(miRNA_expr), noncoding)

# Keep only miRNA expression, removing phenotypic data from assays object
miRNA_expr = miRNA_expr[match(noncoding, rownames(miRNA_expr)),]

# For mRNA expression, keep thec probe with the maximum variance
probewise_sd = unlist(apply(mRNA_expr, MARGIN = 1, sd))
df = data.frame(cbind(gene, probewise_sd, names(probewise_sd)))
# df %>% group_by(gene_symbols) %>% top_n(1, probewise_sd)
library(dplyr)
filtered = df %>% group_by(gene) %>% dplyr::slice(which.max(probewise_sd))
genes_filtered = filtered$gene
probes_filtered = filtered$V3
mRNA_expr = mRNA_expr[which(rownames(mRNA_expr) %in% probes_filtered),]
rownames(mRNA_expr) = genes_filtered[match(rownames(mRNA_expr), probes_filtered)]
mRNA_expr = data.frame(mRNA_expr)
head(mRNA_expr)[,1:5]

# Check chromosome locations of mRNA
f1_subset = f1[match(rownames(mRNA_expr), f1$GENE_SYMBOL),]
chr_loc = unlist(lapply(strsplit(f1_subset$CHROMOSOMAL_LOCATION, split = ":"), function(x){substr(x[1],1,4)}))
table(chr_loc)

# Remove Y genes and unmapped genes.
mRNA_expr = mRNA_expr[-which(chr_loc %in% c("chrY", "unma")),]

# Join mRNA and miRNA
colnames(miRNA_expr) = colnames(mRNA_expr)
expr = rbind(mRNA_expr, miRNA_expr)

# Finalize phenotypic data
phenotype = phenotype1[match(colnames(expr), rownames(phenotype1)),]

write.table(expr, "/home/esaha/BONOBO/data/miRNA_mRNA_breastCancer/GSE19783_miRNA_mRNA_expression.txt", row.names=T, col.names=T)
write.table(phenotype, "/home/esaha/BONOBO/data/miRNA_mRNA_breastCancer/GSE19783_phenotype.txt", row.names=T, col.names=T)

